knitr::opts_chunk$set(fig.align='center', message=F, warning=F, cache=T,cache.lazy = F,
                      class.source="fold-hide")

Load libraries

library(tibble)
library(tidyr)
library(readr)
library(stringr)
library(ggplot2)
library(cowplot)
library(uwot)
library(Rphenograph)
library(pheatmap)
library(cytofkit)
library(needs)
library(dplyr)
library(knitr)
library(ggridges)
library(grDevices) # to change the color in pheatmap

prioritize(dplyr)

theme_set(theme_classic())

# if(!require(devtools)){
#   install.packages("devtools") # If not already installed
# }
# devtools::install_github("JinmiaoChenLab/Rphenograph")

Overview of the tutorial

In this tutorial we will analyse a public data set (Georg et al. 2021) of single cell phopho-proteomics measured with Cytometry by Time of Flight (CyTOF). In this study, the authors performed CyTOF of whole blood samples from mild and severe COVID-19 patients during the acute and convalescent phase, and patients with other acute respiratory infections (Flu-like illness), as well as patients chronically infected by human immunodeficiency virus (HIV) or hepatitis B (HBV) and healthy controls. They analysed the T cell space and identified highly activated CD16+ T cells in severe COVID-19, which led the authors to hypothesise about the pathological role of these cytotoxic T cells. This hypothesis was then tested and confirmed with functional analyses, and found suitable mechanisms for their induction. In this tutorial you will learn how to perform such computational analysis either in T cells, B cells, or monocytes!

Due to time constrains, we will analyse 5% of the data set. We will start by manually pre-gating the immune cell type of interest, then we will visually explore the data by reducing the dimensionality and plotting a UMAP. After this, we will cluster the data to find discrete communities of cells, using two different algorithms for comparison. Then we will annotate/give names to these clusters by looking at the average protein expression in each community/cluster. Finally, we will calculate the abundance of each cluster per donor and identify COVID-19 or severe-specific clusters.


Read the data

We start reading the data table “data_norm_sub.csv” with the function read.csv(). The data was already pre-processed (filter-out dead cells, doublets, debris, and batch-corrected).

data_norm_sub <- read.csv("~/Documents/INsTRuCT_workshop/data_norm_sub5.csv")

What are the columns? What are the rows?

colnames(data_norm_sub)
##  [1] "cellid"                  "Run"                    
##  [3] "FCS.Filename"            "id"                     
##  [5] "Individuals"             "Group"                  
##  [7] "Severity"                "Disease.phase"          
##  [9] "max..WHO.scale"          "sev_merge"              
## [11] "Days.post.symptom.onset" "Week"                   
## [13] "sev_week"                "followup"               
## [15] "CD45"                    "CD3"                    
## [17] "CD19"                    "CD15"                   
## [19] "CD8"                     "TCRgd"                  
## [21] "CD62L"                   "CD45RO"                 
## [23] "CD28"                    "CD27"                   
## [25] "CD226"                   "ICOS"                   
## [27] "PD1"                     "Lag3"                   
## [29] "TIGIT"                   "CD96"                   
## [31] "CD25"                    "CD56"                   
## [33] "HLADR"                   "CD38"                   
## [35] "CD137"                   "CD69"                   
## [37] "Ki67"                    "CXCR3"                  
## [39] "CXCR5"                   "CCR6"                   
## [41] "CRTH2"                   "KLRB1"                  
## [43] "KLRG1"                   "KLRF1"                  
## [45] "CD95"                    "CD10"                   
## [47] "CD16"                    "CD34"                   
## [49] "CD123"                   "CD11c"                  
## [51] "CD21"                    "CD14"                   
## [53] "IgD"                     "IgM"

Create a vector with the name of each measured protein (what we call “the CyTOF panel”).

panel <- colnames(data_norm_sub)[15:54]

color_severity <- c(
  "healthy" = "#0449FF",
  "FLI" =  "#807F7F",
  "HIV" = "#40007F",
  "HBV" =  "magenta",
  "mild/moderate" = "#FFB651",
  "severe/critical" = "#F82000")

Let’s look at how the values of markers are distributed:

data_norm_sub %>% 
    sample_frac(0.2) %>% 
    pivot_longer(names_to = "marker",values_to = "value",panel) %>%
    ggplot(aes(value)) +
    geom_density() + facet_wrap(~marker, scale = "free") + 
  theme_classic()

We notice that these are skewed distributions: Many small values, some very large values. Therefore, it makes more sense to look at these on a logarithmic scale:

data_norm_sub %>% 
    sample_frac(0.2) %>% 
    pivot_longer(names_to = "marker",values_to = "value",panel) %>%
    mutate(log_value = log(value+1)) %>% 
    ggplot(aes(log_value)) +
    geom_density() + facet_wrap(~marker, scale = "free")+ 
    theme_classic()

In the CyTOF community, people commonly use the hypebolic arcsine (arcsinh) transformation: \[ \rm arsinh (x) = \ln(x + \sqrt{x^2+1}) \]


Pre-gating of B cells

Using ggplot() and geom_point(), generate a scatter plot to decide the gates.
We visualize just 10% of the data.

CD45+CD19+

data_norm_sub %>% 
 dplyr::filter(CD19>0, CD45>0) %>% 
 sample_frac(0.1) %>% 
 mutate_at(vars(panel),asinh) %>% 
 dplyr::filter(CD45>0, CD19>0) %>% 
 ggplot(aes(x=CD45, y=CD19)) +
 geom_point(size = 0.01, alpha = 0.1) +
 geom_density_2d()+
 geom_rect(mapping=aes(xmin=3, xmax=8, ymin=4, ymax=10), color="black", alpha=0) + 
 theme_classic()

CD45+CD19+CD3-CD15-

data_norm_sub %>% 
 mutate_at(vars(panel),asinh) %>% 
 dplyr::filter(CD19>4 & CD19<10,
               CD45>3 & CD45<8,
               CD15>0, CD3>0) %>% 
 ggplot(aes(x=CD3, y=CD15)) +
 geom_point(size = 0.01, alpha = 0.1) +
 geom_density_2d()+
 geom_rect(mapping=aes(xmin=0, xmax=3.6, ymin=0, ymax=3), color="black", alpha=0) 

Percentage of monocytes

We first add a column to the data table where each cell gets the classification Bcell = {TRUE, FALSE} according to your gating strategy. Then let’s see if the percentage of B cells make sense and how does it look per disease group (variable sev_merge).

data_norm_sub <- data_norm_sub %>% 
  mutate(Bcells = ifelse(CD19>sinh(4) & 
                          CD19<sinh(10) & 
                          CD45>sinh(3) & 
                          CD45<sinh(8) & 
                          CD3<sinh(3.6) & 
                          CD15<sinh(3), TRUE, FALSE))
data_norm_sub <- data_norm_sub %>% mutate(sev_merge = factor(sev_merge,levels = c("healthy","FLI","HIV","HBV","mild/moderate","severe/critical")))

data_norm_sub %>% 
 count(id,Bcells,sev_merge,Disease.phase) %>% 
 group_by(id) %>% 
 mutate(perc = n/sum(n)*100) %>% 
 ungroup() %>% 
 filter(Bcells) %>% 
 ggplot(aes(y = perc, x = sev_merge, fill = sev_merge)) + 
 geom_boxplot(position=position_dodge(1), alpha = 0.7)+
 geom_dotplot(binaxis='y', stackdir='center',
              position=position_dodge(1), alpha = 0.7)+
 facet_grid(~ Disease.phase, space = "free_x", scale = "free_x") + 
 scale_fill_manual(values = color_severity)+
 ylab("Percentage of B cells (%)") + 
 xlab("")+
 theme_classic()+ 
 theme(axis.text.x=element_blank(),
       axis.ticks.x=element_blank())


UMAP

We now compute UMAP for B cells across all samples (acute and convalescent), and using all markers, except CD45, CD3, CD19, CD15, CD8, TCRgd and CD14.

We define a vector “sel_markers_Bcells” with the selected markers to be used for the calculation of the UMAP.

sel_markers_Bcells <- panel[!panel %in% c("CD45","CD3", "CD19","CD15","CD8", "TCRgd" ,"CD14")]

Before calculating the UMAP, is important we transform our data (asinh) and apply z-score normalization (function scale). Why?

data_Bcells <- data_norm_sub %>%
  filter(Bcells)

UMAP_Bcells <- data_Bcells %>%
  mutate_at(vars(sel_markers_Bcells), asinh) %>%
  mutate_at(vars(sel_markers_Bcells), scale) %>%
  select(sel_markers_Bcells) %>%
  uwot::umap(n_neighbors = 30,spread = 1, min_dist = 0.5,metric = "euclidean", verbose = TRUE, fast_sgd = TRUE)

data_Bcells$UMAP1 <- NA
data_Bcells$UMAP2 <- NA

data_Bcells$UMAP1 <- UMAP_Bcells[,1]
data_Bcells$UMAP2 <- UMAP_Bcells[,2]

We now plot each UMAP, coloured by severity, disease phase, and intensity of a selected marker. Recommendation: subsample cells for visualization, using sample_n, or sample_frac.

Severity

Do you observe specific areas where CV19 samples accumulate? What does this mean?

data_Bcells %>% 
  ggplot(aes(x = UMAP1, y=UMAP2, color = sev_merge)) +
  geom_point(alpha = 0.5,size = 0.5)+
  guides(colour = guide_legend(ncol = 1,override.aes = list(size=3, alpha = 1))) +
  scale_color_manual(values = color_severity, name = "")+
  theme_classic() + 
  ggtitle("B cells") +
  theme(legend.position = "none",plot.title = element_text(hjust = 0.5)) 

Disease phase

Do you observe specific areas where convalescent samples accumulate? What does this mean?

data_Bcells %>%
  ggplot(aes(x = UMAP1, y=UMAP2, color = Disease.phase)) +
  geom_point(alpha = 0.5,size = 0.5)+
  guides(colour = guide_legend(ncol = 1,override.aes = list(size=3, alpha = 1))) +
  scale_color_manual(values = c("acute" = "red","convalescent"="black"), name = "")+
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

Marker intensity

  • Plot the UMAPs coloured by the expression of your favorite marker. Remember to do the corresponding transformations on the intensity values.
p1 <- data_Bcells %>%
  mutate_at(vars(sel_markers_Bcells), asinh) %>%
  mutate_at(vars(sel_markers_Bcells), scale) %>%
  ggplot(aes(x = UMAP1, y=UMAP2, color = CD34)) +
  geom_point(alpha = 0.5,size = 0.5)+
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_gradient2(low = "blue", mid = "grey", high = "red", midpoint = 0)


p2 <- data_Bcells %>%
  mutate_at(vars(sel_markers_Bcells), asinh) %>%
  mutate_at(vars(sel_markers_Bcells), scale) %>%
  ggplot(aes(x = UMAP1, y=UMAP2, color = IgM)) +
  geom_point(alpha = 0.5,size = 0.5)+
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_gradient2(low = "blue", mid = "grey", high = "red", midpoint = 0)


p3 <- data_Bcells %>%
  mutate_at(vars(sel_markers_Bcells), asinh) %>%
  mutate_at(vars(sel_markers_Bcells), scale) %>%
  ggplot(aes(x = UMAP1, y=UMAP2, color = CD38)) +
  geom_point(alpha = 0.5,size = 0.5)+
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_gradient2(low = "blue", mid = "grey", high = "red", midpoint = 0)

plot_grid(p1,p2,p3,nrow = 1)


Unsupervised clustering

We now perform unsupervised clustering analysis on samples from control, FLI, HIV, HBV, and acute COVID-19 using the selected markers. As done for the UMAP calculation, before clustering is important we transform and z-score normalize our data. Let’s look at the distribution of the markers used for clustering:

data_Bcells %>% 
 mutate_at(vars(sel_markers_Bcells), asinh) %>% 
 mutate_at(vars(sel_markers_Bcells), scale) %>% 
 filter(Disease.phase == "acute") %>%
 select(sel_markers_Bcells) %>% 
 pivot_longer(names_to = "marker", values_to = "value", everything()) %>% 
 ggplot(aes(value)) + 
 geom_density() + facet_wrap(~marker, scale = "free") + theme_classic()

What’s the main difference between RPhenograph and FlowSOM?

Run

RPhenograph

IMPORTANT! In RPhenograph we define the number of nearest neighbours.

start_time <- Sys.time()


clust_bcells_rpheno <- data_Bcells %>% 
 mutate_at(vars(sel_markers_Bcells), asinh) %>% 
 mutate_at(vars(sel_markers_Bcells), scale) %>% 
 filter(Disease.phase == "acute") %>%
 select(sel_markers_Bcells) %>% 
 Rphenograph(k = 30)

end_time <- Sys.time()
time_elapsed <- round(as.numeric(gsub('Time difference of ', '', difftime(end_time, start_time, units = "mins"))), 2)
print(paste('\n', time_elapsed, 'minutes passed'))
## [1] "\n 0.42 minutes passed"

After running the clustering algorithm, we add a column “Rpheno” in the data table with the cluster label for each cell:

clust_ids <- data_Bcells %>% 
   filter(Disease.phase == "acute") %>%
   pull(cellid)

clust_bcells_rpheno <- tibble(cellid = clust_ids, Rpheno = as.character(membership(clust_bcells_rpheno)))

data_Bcells <- data_Bcells %>% left_join(clust_bcells_rpheno)
data_Bcells <- data_Bcells %>% mutate(Rpheno = factor(Rpheno, levels = str_sort(unique(Rpheno), numeric = TRUE)))

FlowSOM

IMPORTANT! In FlowSOM we define the number of clusters.

start_time <- Sys.time()

clust_bcells_flowsom <- data_Bcells %>% 
 mutate_at(vars(sel_markers_Bcells), asinh) %>% 
 mutate_at(vars(sel_markers_Bcells), scale) %>% 
 filter(Disease.phase == "acute") %>%
 select(sel_markers_Bcells) %>% 
 cytof_cluster(xdata=. , method = 'FlowSOM', FlowSOM_k = 11)

end_time <- Sys.time()
time_elapsed <- round(as.numeric(gsub('Time difference of ', '', difftime(end_time, start_time, units = "mins"))), 2)
print(paste('\n', time_elapsed, 'minutes passed'))
## [1] "\n 0.04 minutes passed"

After running the clustering algorithm, we add a column “flowsom” in the data table with the cluster label for each cell:

clust_ids <- data_Bcells %>%
  filter(Disease.phase == "acute") %>%
  pull(cellid)

clust_bcells_flowsom <- tibble(cellid = clust_ids, flowsom = as.character(clust_bcells_flowsom))
data_Bcells <- data_Bcells %>% left_join(clust_bcells_flowsom)
data_Bcells <- data_Bcells %>% mutate(flowsom = factor(flowsom, levels = str_sort(unique(flowsom), numeric = TRUE)))

Cluster size

We now visualize the number of cells in each cluster:

Rphenograph

 data_Bcells %>% filter(Disease.phase == "acute") %>%
 count(Rpheno) %>% 
 ggplot(aes(x = Rpheno, y = n, label = n))+
 geom_col(position = "dodge") + 
 theme_classic()+ 
 geom_label() 

FlowSOM

 data_Bcells %>% filter(Disease.phase == "acute") %>%
 count(flowsom) %>% 
 ggplot(aes(x = flowsom, y = n, label = n))+
 geom_col(position = "dodge") + 
  theme_classic()+ 
  geom_label() 

UMAP clusters

And then plot the UMAP, this time coloured by cluster:

RPhenograph

data_Bcells %>% 
  filter(data_Bcells$Disease.phase == "acute") %>% 
  ggplot(aes(x = UMAP1, y=UMAP2, color = Rpheno)) +
  geom_point(alpha = 0.5,size = 1)+
  guides(colour = guide_legend(ncol = 1,override.aes = list(size=3, alpha = 1))) +
  theme_classic() + 
  ggtitle("B cells") +
  theme(plot.title = element_text(hjust = 0.5)) 

FlowSOM

data_Bcells %>% 
  filter(data_Bcells$Disease.phase == "acute") %>% 
  ggplot(aes(x = UMAP1, y=UMAP2, color = flowsom)) +
  geom_point(alpha = 0.5,size = 1)+
  guides(colour = guide_legend(ncol = 1,override.aes = list(size=3, alpha = 1))) +
  theme_classic() + 
  ggtitle("B cells") +
  theme(plot.title = element_text(hjust = 0.5)) 

Marker distribution

Let’s look at the marker distribution per cluster:

RPhenograph

data_Bcells %>% 
 mutate_at(vars(sel_markers_Bcells), asinh) %>% 
 mutate_at(vars(sel_markers_Bcells), scale) %>% 
 filter(Disease.phase == "acute") %>%
 select(sel_markers_Bcells, Rpheno) %>% 
 pivot_longer(names_to = "marker", values_to = "value", sel_markers_Bcells) %>% 
 ggplot(aes(x= value, fill = Rpheno, y = Rpheno)) + 
 geom_density_ridges() + facet_wrap(~marker, scale = "free") + 
 theme_classic()

FlowSOM

data_Bcells %>% 
 mutate_at(vars(sel_markers_Bcells), asinh) %>% 
 mutate_at(vars(sel_markers_Bcells), scale) %>% 
 filter(Disease.phase == "acute") %>%
 select(sel_markers_Bcells, flowsom) %>% 
 pivot_longer(names_to = "marker", values_to = "value", sel_markers_Bcells) %>% 
 ggplot(aes(x= value, fill = flowsom, y = flowsom)) + 
 geom_density_ridges() + facet_wrap(~marker, scale = "free") + 
 theme_classic()


Cluster annotation

We now want to actually understand what are these clusters we found. For this, we can visualize the average expression of each marker in each cluster with a heatmap.** Remember to do the corresponding transformation of the intensity values.**

Heatmap (ggplot)

Rphenograph

data_Bcells %>%
 mutate_at(vars(sel_markers_Bcells), asinh) %>%
 mutate_at(vars(sel_markers_Bcells), scale) %>%
 filter(Disease.phase == "acute") %>%
 select(sel_markers_Bcells,Rpheno) %>%
 group_by(Rpheno) %>%
 summarise_at(vars(sel_markers_Bcells), funs(mean(., na.rm=TRUE))) %>%
 pivot_longer(names_to = "markers", values_to = "avg_zscore", - Rpheno) %>%
 mutate(markers = factor(markers,levels = rev(sel_markers_Bcells))) %>%
 ggplot(aes(x = Rpheno, y = markers, fill = avg_zscore)) +
 geom_tile() +
 scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(-3,7.5))

FlowSOM

data_Bcells %>% 
 mutate_at(vars(sel_markers_Bcells), asinh) %>% 
 mutate_at(vars(sel_markers_Bcells), scale) %>% 
 filter(Disease.phase == "acute") %>%
 select(sel_markers_Bcells,flowsom) %>%
 group_by(flowsom) %>%
 summarise_at(vars(sel_markers_Bcells), funs(mean(., na.rm=TRUE))) %>% 
 pivot_longer(names_to = "markers", values_to = "avg_zscore", - flowsom) %>% 
 mutate(markers = factor(markers,levels = sel_markers_Bcells)) %>% 
 ggplot(aes(x = flowsom, y = markers, fill = avg_zscore)) + 
 geom_tile() + 
 scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(-3,7.5))

Heatmap (pheatmap)

With the library “pheatmap” we can additionally group the clusters (dendogram) and group the markers by category (annotation).

RPhenograph

data_heatmap <- data_Bcells %>%
 mutate_at(vars(sel_markers_Bcells), asinh) %>%
 mutate_at(vars(sel_markers_Bcells), scale) %>%
 filter(Disease.phase == "acute") %>%
 select(sel_markers_Bcells,Rpheno) %>%
 group_by(Rpheno) %>%
 summarise_at(vars(sel_markers_Bcells), funs(mean(., na.rm=TRUE))) %>%
 column_to_rownames("Rpheno")

annotation_rows <- data.frame( marker_category = c(rep("Differentiation",2), 
                               rep("Co-stimulation",4), 
                               rep("Co-inhibition",4),
                               rep("Activation",7),
                               rep("Chemokine receptor",4),
                               rep("Others",12)))

rownames(annotation_rows) <- sel_markers_Bcells

data_heatmap %>% t() %>%
  pheatmap(cluster_rows = F, 
           cluster_cols = T, 
           annotation_row = annotation_rows, 
           annotation_names_row = F, 
           gaps_row = c(2, 6, 10, 17, 21), 
           breaks = seq(-4, 4, length.out = 100),
           color = colorRampPalette(c("blue", "white", "red"))(100)
)

FlowSOM

data_heatmap <- data_Bcells %>%
 mutate_at(vars(sel_markers_Bcells), asinh) %>%
 mutate_at(vars(sel_markers_Bcells), scale) %>%
 filter(Disease.phase == "acute") %>%
 select(sel_markers_Bcells,flowsom) %>%
 group_by(flowsom) %>%
 summarise_at(vars(sel_markers_Bcells), funs(mean(., na.rm=TRUE))) %>%
 column_to_rownames("flowsom")

annotation_rows <- data.frame( marker_category = c(rep("Differentiation",2), 
                               rep("Co-stimulation",4), 
                               rep("Co-inhibition",4),
                               rep("Activation",7),
                               rep("Chemokine receptor",4),
                               rep("Others",12)))

rownames(annotation_rows) <- sel_markers_Bcells

data_heatmap %>% t() %>%
  pheatmap(cluster_rows = F, 
           cluster_cols = T, 
           annotation_row = annotation_rows, 
           annotation_names_row = F, 
           gaps_row = c(2, 6, 10, 17, 21), 
           breaks = seq(-4, 4, length.out = 100),
           color = colorRampPalette(c("blue", "white", "red"))(100)
)

Quick comparison

How much percentage of cells from RPhenograph cluster X where classified in FlowSOM cluster Y?

data_heatmap <- data_Bcells %>%  
    filter(Disease.phase == "acute") %>%
    count(Rpheno, flowsom) %>% 
    tidyr::complete(Rpheno,flowsom,fill = list(n=0)) %>% 
    group_by(Rpheno) %>% 
    mutate(total_Rpheno = sum(n)) %>% 
    ungroup() %>% mutate(perc = n/total_Rpheno*100) %>% select(Rpheno, flowsom, perc) %>% 
    pivot_wider(names_from = Rpheno, values_from = "perc") %>% 
    column_to_rownames("flowsom") 


pheatmap(data_heatmap, cluster_cols = T,cluster_rows = T,
         labels_row = paste0(rownames(data_heatmap), "_flowsom"),
         labels_col = paste0(rownames(data_heatmap), "_rpheno"))


Cluster abundance

Finally, we can calculate the abundance of each cluster in each sample and check if there are COVID-specific or severity-specific groups of cells.

ACHTUNG! In this data set, donors where sampled multiple times during the disease course. We establish an arbitrary rule of choosing the first sample per donor (usually during the first week post-symptom onset) if multiple samples are available.

selected_ids <- data_Bcells %>% 
 filter(Disease.phase == "acute") %>% 
 count(Individuals,id,Group,sev_merge,Days.post.symptom.onset) %>% 
 select(-n) %>% 
 group_by(Individuals) %>%
 mutate(n_samples = 1:n()) %>% 
 mutate(select_id = ifelse(n_samples == 1, TRUE,
                                  ifelse(min(as.numeric(Days.post.symptom.onset)) == as.numeric(Days.post.symptom.onset), 
                                         TRUE,
                                         FALSE))) %>%
 filter(select_id) %>%
 pull(id)

We can visualize the abundance of each cluster per donor as boxplots per severity group.

RPhenograph

 data_Bcells %>% 
 filter(id %in% selected_ids) %>% 
 count(id,sev_merge,Rpheno) %>% 
 group_by(id) %>% 
 mutate(perc = n/sum(n)*100) %>% 
 ungroup() %>% 
  # When using the function count(), if a cluster is absent in a donor, it will not be counted as zero. 
  # So we complete the count table by filling the missing clusters with a 0.
 tidyr::complete(id,Rpheno,fill = list(n=0,perc = 0)) %>% 
 group_by(id) %>% 
 fill(sev_merge, .direction = "downup") %>% 
 ungroup() %>% 
 mutate(perc = ifelse(is.na(perc),0,perc)) %>% 
 ggplot(aes(x = sev_merge, y = perc, fill = sev_merge)) + 
 geom_boxplot() + 
 geom_jitter()+
 facet_wrap(~Rpheno, scale = "free") + 
 scale_fill_manual(values = color_severity) + 
 theme(axis.text.x = element_blank()) + 
 ggtitle("RPhenograph cluster abundance")

FlowSOM

 data_Bcells %>% 
 filter(id %in% selected_ids) %>% 
 count(id,sev_merge,flowsom) %>% 
 group_by(id) %>% 
 mutate(perc = n/sum(n)*100) %>% 
 ungroup() %>% 
  # When using the function count(), if a cluster is absent in a donor, it will not be counted as zero. 
  # So we complete the count table by filling the missing clusters with a 0.
 tidyr::complete(id,flowsom,fill = list(n=0,perc = 0)) %>%
 group_by(id) %>%
 fill(sev_merge, .direction = "downup") %>%
 ungroup() %>%
 mutate(perc = ifelse(is.na(perc),0,perc)) %>%
 ggplot(aes(x = sev_merge, y = perc, fill = sev_merge)) + 
 geom_boxplot() + 
 geom_jitter()+
 facet_wrap(~flowsom, scale = "free") + 
 scale_fill_manual(values = color_severity) + 
 theme(axis.text.x = element_blank()) +
  ggtitle('FlowSom cluster abundance')


Now is your turn

Make a presentation/talk (up to 20 minutes long, 10 minutes discussion) about the analysis pipeline that you familiarized yourself with. Mention what cell type you looked at, and present the results with your interpretation given the context of the experimental design.

Doesn’t have to be a proper PowerPoint, you can just open the R Markdown and go through it.

Following questions should be touched upon: